/*---------------------------------------------------------------------------
  Cafe survey segmentation.

  Steps:
    1. Import cafe.csv into WORK.CAFE and inspect it.
    2. Clean: cap q1 at 13 and set the code 8 on the q6a_* items to missing.
    3. k-means cluster (PROC FASTCLUS) on the ten q6a_* rating items.
    4. Profile the clusters on the raw items, on binary (rating <= 3) flags,
       and on the remaining survey questions.
    5. Discriminant analysis of the clusters on the non-clustering questions.
---------------------------------------------------------------------------*/

libname hold 'C:\andrew\Stats 747\2008';

/* ---- 1. Import cafe data ------------------------------------------------ */
proc import out=work.cafe
            datafile="C:\andrew\Stats 747\2008\cafe.csv"
            dbms=csv replace;
    getnames=yes;
    datarow=2;
run;

proc contents data=cafe;
run;

/* Check the raw data before cleaning. */
proc freq data=cafe;
    table q1 q2 q3 q7 q8 q4a1-q4a8 q5a q5b q6a_1-q6a_10;
run;

/* ---- 2. Clean into a temporary data set --------------------------------- */
data cafe2;
    set cafe;

    label q6a_1  = "6(a) Convenient Hours"
          q6a_2  = "6(b) Speed of Service"
          q6a_3  = "6(c) Value for Money"
          q6a_4  = "6(d) Employee Friendliness"
          q6a_5  = "6(e) Cleanliness of the facility"
          q6a_6  = "6(f) Selection of Food"
          q6a_7  = "6(g) Appearance of Food"
          q6a_8  = "6(h) Freshness of Food"
          q6a_9  = "6(i) Healthy Choices"
          q6a_10 = "6(j) Availability of Nutritional Information"
          q7     = "Age"
          q8     = "Gender";

    /* Max # coffees = 13: cap anything larger. */
    if q1 > 13 then q1 = 13;

    /* Replace 8's with missing on every q6a item.
       The previous code assigned 6 here, which contradicted this stated
       intent and fed the 8 responses into the cluster distances and means
       as a numeric rating.
       NOTE(review): 8 is presumably a "don't know / not applicable" code -
       confirm against the questionnaire. */
    array q6{*} q6a_1-q6a_10;
    do i = 1 to dim(q6);
        if q6{i} = 8 then q6{i} = .;
    end;
    drop i;
run;

/* Check the changes. */
proc freq data=cafe2;
    table q1 q6a_1-q6a_10;
run;

/* ---- 3. k-means clustering ---------------------------------------------- */
/* Run this many times with different random seeds (456 below) and keep the
   most useful solution. */
proc fastclus data=cafe2 maxc=3 replace=random random=456 out=clusters;
    var q6a_1-q6a_10;
run;

/* ---- 4. Investigate the clusters ---------------------------------------- */
proc sort data=clusters;
    by cluster;
run;

proc means data=clusters mean;
    by cluster;
    var q6a_1-q6a_10;
run;

/* Easier to read as binary flags: binq6a_k = 1 when q6a_k <= 3, else 0.
   A missing rating keeps a missing flag. SAS orders missing below every
   number, so without the explicit guard a missing rating would satisfy
   "<= 3" and be counted as 1.
   NOTE(review): the original comment called this a flag for "high value";
   that assumes codes 1-3 are the favourable end of the scale - confirm. */
data clusters2;
    set clusters;

    array q6{*}    q6a_1-q6a_10;
    array binq6{*} binq6a_1-binq6a_10;

    do i = 1 to dim(q6);
        if missing(q6{i}) then binq6{i} = .;
        else binq6{i} = (q6{i} <= 3);
    end;
    drop i;

    label binq6a_1  = "6(a) Convenient Hours"
          binq6a_2  = "6(b) Speed of Service"
          binq6a_3  = "6(c) Value for Money"
          binq6a_4  = "6(d) Employee Friendliness"
          binq6a_5  = "6(e) Cleanliness of the facility"
          binq6a_6  = "6(f) Selection of Food"
          binq6a_7  = "6(g) Appearance of Food"
          binq6a_8  = "6(h) Freshness of Food"
          binq6a_9  = "6(i) Healthy Choices"
          binq6a_10 = "6(j) Availability of Nutritional Information";
run;

/* Check each flag against its source item. */
proc freq data=clusters2;
    table q6a_1 binq6a_1
          q6a_2 binq6a_2
          q6a_3 binq6a_3
          q6a_4 binq6a_4
          q6a_5 binq6a_5
          q6a_6 binq6a_6
          q6a_7 binq6a_7
          q6a_8 binq6a_8
          q6a_9 binq6a_9
          q6a_10 binq6a_10;
run;

proc means data=clusters2 mean;
    class cluster;
    var binq6a_1-binq6a_10;
run;

/* Good for differentiating clusters by solution: mean of each binary flag
   (i.e. proportion flagged) per cluster. */
proc tabulate data=clusters2;
    class cluster;
    var binq6a_1-binq6a_10;
    table binq6a_1 binq6a_2 binq6a_3 binq6a_4 binq6a_5
          binq6a_6 binq6a_7 binq6a_8 binq6a_9 binq6a_10,
          MEAN*cluster;
run;

/* Same layout on the original ratings. */
proc tabulate data=clusters2;
    class cluster;
    var q6a_1-q6a_10;
    table q6a_1 q6a_2 q6a_3 q6a_4 q6a_5
          q6a_6 q6a_7 q6a_8 q6a_9 q6a_10,
          MEAN*cluster;
run;

/* Cluster profile on the questions not used for clustering. */
proc tabulate data=clusters2;
    class cluster;
    var q1 q2 q3 q7 q8 q4a1-q4a8 q5a q5b;
    table q1 q2 q3 q7 q8
          q4a1 q4a2 q4a3 q4a4 q4a5 q4a6 q4a7 q4a8
          q5a q5b,
          MEAN*cluster;
run;

/* ---- 5. Discriminate these segments ------------------------------------- */
/* Note: q4a3-q4a8 are all zeroes, so only q4a1-q4a2 are used. */
proc discrim data=clusters2;
    class cluster;
    var q1 q2 q3 q7 q8 q4a1-q4a2 q5a q5b;
run;